{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Task \n",
    "\n",
    "This notebook cleans up PSID data. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative directory holding the yearly pequiv_<year>.dta files; the cleaned pickle is written here too.\n",
    "data_path = '../data_sources/psid/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from the 1989 file. convert_categoricals=False keeps the numeric codes\n",
    "# rather than turning Stata value labels into pandas Categoricals.\n",
    "df = pd.read_stata(data_path+'pequiv_1989.dta',convert_categoricals=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the '_1989' suffix from every column name and record the survey year.\n",
    "df = df.rename(columns=lambda name: name.replace('_1989','')).assign(year=1989)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the 1990-1997 files under the 1989 frame.\n",
    "# Frames are collected in a list and concatenated once: DataFrame.append was\n",
    "# deprecated in pandas 1.4 and removed in 2.0, and appending inside the loop\n",
    "# re-copies the accumulated frame on every iteration.\n",
    "# Seeding with the 1989 rows only means re-running this cell does not stack\n",
    "# the later years a second time.\n",
    "frames = [df[df['year'] == 1989]]\n",
    "for t in range(1990,1998):\n",
    "    dft = pd.read_stata(data_path+'pequiv_'+str(t)+'.dta',convert_categoricals=False)\n",
    "    # Strip the '_<year>' suffix, as done for the 1989 file.\n",
    "    dft.columns = [c.replace('_'+str(t),'') for c in dft.columns]\n",
    "    dft['year'] = t\n",
    "    frames.append(dft)\n",
    "# Defaults (ignore_index=False, sort=False) match what df.append(dft) did.\n",
    "df = pd.concat(frames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1989    19669\n",
       "1990    19932\n",
       "1991    19962\n",
       "1992    20334\n",
       "1993    21450\n",
       "1994    23620\n",
       "1995    23182\n",
       "1996    23060\n",
       "1997    19132\n",
       "Name: year, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: number of rows loaded for each survey year.\n",
    "df['year'].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose d11105 code equals 1.\n",
    "# NOTE(review): presumably 1 = household head in the equivalent-file coding -- confirm against the codebook.\n",
    "df = df.loc[df['d11105'].eq(1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the source columns, tag the country, and keep only the variables\n",
    "# used downstream. assign()/copy() produce an independent frame, so the column\n",
    "# assignments below never write into a slice of the filtered data\n",
    "# (assigning into a filtered slice triggers SettingWithCopyWarning).\n",
    "df = df.rename({'x11101ll':'pid','d11101':'age','i11113':'Y'},axis=1).assign(co='US')\n",
    "# Drop non-positive incomes; missing Y is dropped too because NaN > 0 is False.\n",
    "df = df.loc[df['Y']>0, ['co','pid','year','age','Y']].copy()\n",
    "# Express income relative to the pooled sample mean.\n",
    "df['Y'] = df['Y']/df['Y'].mean()\n",
    "# Winsorise at the 2.5th and 97.5th percentiles of the normalised income.\n",
    "up = df['Y'].quantile(0.975)\n",
    "low = df['Y'].quantile(0.025)\n",
    "df['Y'] = df['Y'].clip(low,up)\n",
    "# Y is strictly positive and non-missing at this point, so no NaN guard is needed.\n",
    "df['logY'] = np.log(df['Y'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>co</th>\n",
       "      <th>pid</th>\n",
       "      <th>year</th>\n",
       "      <th>age</th>\n",
       "      <th>Y</th>\n",
       "      <th>logY</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>US</td>\n",
       "      <td>4001</td>\n",
       "      <td>1989</td>\n",
       "      <td>67.0</td>\n",
       "      <td>0.720443</td>\n",
       "      <td>-0.327888</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>US</td>\n",
       "      <td>4003</td>\n",
       "      <td>1989</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0.900614</td>\n",
       "      <td>-0.104679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>US</td>\n",
       "      <td>4008</td>\n",
       "      <td>1989</td>\n",
       "      <td>24.0</td>\n",
       "      <td>0.684175</td>\n",
       "      <td>-0.379542</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>US</td>\n",
       "      <td>4170</td>\n",
       "      <td>1989</td>\n",
       "      <td>34.0</td>\n",
       "      <td>0.789620</td>\n",
       "      <td>-0.236204</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>US</td>\n",
       "      <td>4172</td>\n",
       "      <td>1989</td>\n",
       "      <td>40.0</td>\n",
       "      <td>1.564385</td>\n",
       "      <td>0.447493</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    co   pid  year   age         Y      logY\n",
       "0   US  4001  1989  67.0  0.720443 -0.327888\n",
       "2   US  4003  1989  38.0  0.900614 -0.104679\n",
       "6   US  4008  1989  24.0  0.684175 -0.379542\n",
       "15  US  4170  1989  34.0  0.789620 -0.236204\n",
       "16  US  4172  1989  40.0  1.564385  0.447493"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the cleaned panel.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "year\n",
       "1989    0.845942\n",
       "1990    0.873504\n",
       "1991    0.903037\n",
       "1992    0.927812\n",
       "1993    0.972388\n",
       "1994    0.943375\n",
       "1995    0.981465\n",
       "1996    1.035376\n",
       "1997    1.152582\n",
       "Name: Y, dtype: float64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean normalised income by survey year. Select 'Y' before aggregating:\n",
    "# averaging the whole frame also tries the string column 'co', which raises\n",
    "# a TypeError in pandas >= 2.0 and wastes work on unused columns before that.\n",
    "df.groupby('year')['Y'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1-based wave counter: 1989 -> 1, 1990 -> 2, ..., 1997 -> 9.\n",
    "df = df.assign(wave=df['year'] - 1988)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the cleaned panel (co, pid, year, age, Y, logY, wave) alongside the raw files.\n",
    "df.to_pickle(data_path+'psid_incomes.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "cf2a50979671a58939829e6829efb726aa5da11149213b77bd50351f899d04fb"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
